資料準備流程

Fig-3: Data Preparation


Preparing The Predictors (X)

rm(list=ls(all=TRUE))
pacman::p_load(magrittr,latex2exp,Matrix,dplyr,tidyr,ggplot2,caTools,plotly)
load("data/tf0.rdata")
The Demarcation Date

Remove data after the demarcation date

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
Aggregate for the Transaction Records
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # 交易日期
  cust = first(cust),  # 顧客 ID
  age = first(age),    # 顧客 年齡級別
  area = first(area),  # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame  # 88387
summary(X)
##       tid             date                cust               age           
##  Min.   :    1   Min.   :2000-11-01   Length:88387       Length:88387      
##  1st Qu.:22098   1st Qu.:2000-11-23   Class :character   Class :character  
##  Median :44194   Median :2000-12-12   Mode  :character   Mode  :character  
##  Mean   :44194   Mean   :2000-12-15                                        
##  3rd Qu.:66291   3rd Qu.:2001-01-12                                        
##  Max.   :88387   Max.   :2001-01-31                                        
##      area               items             pieces            total        
##  Length:88387       Min.   :  1.000   Min.   :  1.000   Min.   :    5.0  
##  Class :character   1st Qu.:  2.000   1st Qu.:  3.000   1st Qu.:  230.0  
##  Mode  :character   Median :  5.000   Median :  6.000   Median :  522.0  
##                     Mean   :  6.994   Mean   :  9.453   Mean   :  888.7  
##                     3rd Qu.:  9.000   3rd Qu.: 12.000   3rd Qu.: 1120.0  
##                     Max.   :112.000   Max.   :339.000   Max.   :30171.0  
##      gross        
##  Min.   :-1645.0  
##  1st Qu.:   23.0  
##  Median :   72.0  
##  Mean   :  138.3  
##  3rd Qu.:  174.0  
##  Max.   : 8069.0
Check Quantile and Remove Outlier
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
Aggregate for Customer Records
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584
nrow(A)
## [1] 28584



Preparing the Target Variables (Y)

Aggregate Feb’s Transaction by Customer
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16900
The Target for Regression - A$amount

Simply a Left Joint

A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A$buy
A$buy = !is.na(A$amount)
table(A$buy, !is.na(A$amount))
##        
##         FALSE  TRUE
##   FALSE 15342     0
##   TRUE      0 13242
Summary of the Dataset
summary(A)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               amount         buy         
##  Length:28584       Min.   :    8   Mode :logical  
##  Class :character   1st Qu.:  454   FALSE:15342    
##  Mode  :character   Median :  993   TRUE :13242    
##                     Mean   : 1499                  
##                     3rd Qu.: 1955                  
##                     Max.   :28089                  
##                     NA's   :15342



Preparing Train & Test Datasets

Train & Test Dataset
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7) 
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008  8576
cbind(A, spl) %>% filter(buy) %>%  
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)

A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242  9269  3973
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

save(Z, X, A, spl, spl2, file="data/tf3.rdata")


Fig-4: Data Spliting

Spliting for Classification
TR = subset(A, spl)
TS = subset(A, !spl)


Classification Model

glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial()) 
summary(glm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9, 
##     11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7931  -0.8733  -0.6991   1.0384   1.8735  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.259e+00  1.261e-01  -9.985  < 2e-16 ***
## r            -1.227e-02  8.951e-04 -13.708  < 2e-16 ***
## s             9.566e-03  9.101e-04  10.511  < 2e-16 ***
## f             2.905e-01  1.593e-02  18.233  < 2e-16 ***
## m            -3.028e-05  2.777e-05  -1.090  0.27559    
## rev           4.086e-05  1.940e-05   2.106  0.03521 *  
## raw          -2.306e-04  8.561e-05  -2.693  0.00708 ** 
## agea29       -4.194e-02  8.666e-02  -0.484  0.62838    
## agea34        1.772e-02  7.992e-02   0.222  0.82456    
## agea39        7.705e-02  7.921e-02   0.973  0.33074    
## agea44        8.699e-02  8.132e-02   1.070  0.28476    
## agea49        1.928e-02  8.457e-02   0.228  0.81962    
## agea54        1.745e-02  9.323e-02   0.187  0.85155    
## agea59        1.752e-01  1.094e-01   1.602  0.10926    
## agea64        6.177e-02  1.175e-01   0.526  0.59904    
## agea69        2.652e-01  1.047e-01   2.533  0.01131 *  
## agea99       -1.419e-01  1.498e-01  -0.947  0.34347    
## areaz106     -4.105e-02  1.321e-01  -0.311  0.75603    
## areaz110     -2.075e-01  1.045e-01  -1.986  0.04703 *  
## areaz114      3.801e-02  1.111e-01   0.342  0.73214    
## areaz115      2.599e-01  9.682e-02   2.684  0.00727 ** 
## areaz221      1.817e-01  9.753e-02   1.863  0.06243 .  
## areazOthers  -4.677e-02  1.045e-01  -0.448  0.65435    
## areazUnknown -1.695e-01  1.232e-01  -1.375  0.16912    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23295  on 19984  degrees of freedom
## AIC: 23343
## 
## Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3730  873
##   TRUE   1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}
c(1-mean(TS$buy) , acc.ts)  # 0.69998
## [1] 0.5367304 0.6999767
colAUC(pred, TS$buy)        # 0.7556
##                     [,1]
## FALSE vs. TRUE 0.7556038


Regression Model

A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
## 
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02874 -0.23292  0.05011  0.28423  1.45108 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.1651034  0.0501997  23.209  < 2e-16 ***
## r             0.0003390  0.0003138   1.080  0.27999    
## s             0.0002380  0.0003164   0.752  0.45200    
## f             0.0266666  0.0018112  14.723  < 2e-16 ***
## m             0.5165187  0.0375332  13.762  < 2e-16 ***
## rev           0.0240517  0.0363911   0.661  0.50868    
## agea29        0.0471837  0.0252728   1.867  0.06194 .  
## agea34        0.0896806  0.0233116   3.847  0.00012 ***
## agea39        0.1203331  0.0229212   5.250 1.56e-07 ***
## agea44        0.1107960  0.0235428   4.706 2.56e-06 ***
## agea49        0.0649780  0.0244296   2.660  0.00783 ** 
## agea54        0.0838574  0.0266168   3.151  0.00163 ** 
## agea59        0.0395519  0.0310826   1.272  0.20323    
## agea64        0.0059463  0.0325943   0.182  0.85525    
## agea69       -0.0399961  0.0289299  -1.383  0.16685    
## agea99        0.0892782  0.0408041   2.188  0.02870 *  
## areaz106      0.0955455  0.0427171   2.237  0.02533 *  
## areaz110      0.0526326  0.0347075   1.516  0.12944    
## areaz114      0.0154046  0.0364721   0.422  0.67277    
## areaz115      0.0193686  0.0317954   0.609  0.54243    
## areaz221      0.0350306  0.0320485   1.093  0.27440    
## areazOthers   0.0385476  0.0344270   1.120  0.26288    
## areazUnknown  0.0130805  0.0387052   0.338  0.73541    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4249 on 9246 degrees of freedom
## Multiple R-squared:  0.2796, Adjusted R-squared:  0.2779 
## F-statistic: 163.1 on 22 and 9246 DF,  p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(R2train=r2.tr, R2test=r2.ts)
##   R2train    R2test 
## 0.2795931 0.2845795


製作變數、改進模型



###加入品項數與顧客分群

A00 = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),s = max(days), f = n(), m = mean(total), rev = sum(total), raw = sum(gross), age = age[1],  area = area[1], items = mean(items), period=s-r ) %>% data.frame      # 28584
nrow(A00)
## [1] 28584
STS = c("N1","R1","R2","S1")
Status = function(rx,fx,sx,dx) {factor(
  ifelse(dx == 0,
         ifelse(rx <90  , "N1","S1"),
         ifelse( dx/fx <10 ,"R2","R1")), STS)} 
A1 = A00 %>%  mutate(group=Status(r,f,s,period))
N1 = A1 %>% filter(group=="N1")%>%pull(cust)
S1 = A1 %>% filter(group=="S1")%>%pull(cust)
R1 = A1 %>% filter(group=="R1")%>%pull(cust)
R2 = A1 %>% filter(group=="R2")%>%pull(cust)
數值型迴歸(用來預測購買金額) - A$amount

Simply a Left Joint

A1 = merge(A1, feb, by="cust", all.x=T)
類別型迴歸(用來預測回購機率) - A$buy
A1$buy = !is.na(A1$amount)
table(A1$buy, !is.na(A1$amount))
##        
##         FALSE  TRUE
##   FALSE 15342     0
##   TRUE      0 13242
Summary of the Dataset
summary(A1)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               items           period      group          amount     
##  Length:28584       Min.   : 1.00   Min.   : 0.00   N1:11673   Min.   :    8  
##  Class :character   1st Qu.: 3.00   1st Qu.: 0.00   R1:10452   1st Qu.:  454  
##  Mode  :character   Median : 6.00   Median :17.00   R2: 6246   Median :  993  
##                     Mean   : 7.76   Mean   :29.14   S1:  213   Mean   : 1499  
##                     3rd Qu.:10.00   3rd Qu.:59.00              3rd Qu.: 1955  
##                     Max.   :64.00   Max.   :91.00              Max.   :28089  
##                                                                NA's   :15342  
##     buy         
##  Mode :logical  
##  FALSE:15342    
##  TRUE :13242    
##                 
##                 
##                 
## 
X1 = subset(X, cust %in% A1$cust & date < as.Date("2001-02-01"))
Z1 = subset(Z, cust %in% A1$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl3 = sample.split(A1$buy, SplitRatio=0.7) #將資料用隨機數選擇後,進行切割
c(nrow(A1), sum(spl3), sum(!spl3))
## [1] 28584 20008  8576
A12 = subset(A1, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A12)
set.seed(2018); spl4 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A12), sum(spl4), sum(!spl4))
## [1] 13242  9269  3973
Spliting for Classification
TR3 = subset(A1, spl3)
TS3 = subset(A1, !spl3)


Classification Model

glm2 = glm(buy ~ ., TR3[,c(2:10,12, 14)], family=binomial()) 
summary(glm2)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR3[, c(2:10, 
##     12, 14)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7678  -0.8627  -0.6774   1.0281   1.9573  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.355e+00  1.278e-01 -10.602  < 2e-16 ***
## r            -8.028e-03  1.595e-03  -5.033 4.83e-07 ***
## s             5.137e-03  1.604e-03   3.203  0.00136 ** 
## f             2.946e-01  1.883e-02  15.646  < 2e-16 ***
## m            -7.733e-05  3.397e-05  -2.276  0.02283 *  
## rev           4.385e-05  1.945e-05   2.255  0.02415 *  
## raw          -2.666e-04  8.654e-05  -3.081  0.00207 ** 
## agea29       -2.916e-02  8.685e-02  -0.336  0.73703    
## agea34        2.306e-02  8.009e-02   0.288  0.77344    
## agea39        7.815e-02  7.940e-02   0.984  0.32504    
## agea44        8.310e-02  8.151e-02   1.020  0.30793    
## agea49        1.678e-02  8.475e-02   0.198  0.84301    
## agea54        2.221e-02  9.340e-02   0.238  0.81203    
## agea59        1.864e-01  1.096e-01   1.700  0.08905 .  
## agea64        6.994e-02  1.176e-01   0.595  0.55200    
## agea69        2.725e-01  1.049e-01   2.597  0.00939 ** 
## agea99       -1.301e-01  1.501e-01  -0.867  0.38598    
## areaz106     -4.689e-02  1.326e-01  -0.354  0.72354    
## areaz110     -2.111e-01  1.049e-01  -2.013  0.04415 *  
## areaz114      3.772e-02  1.115e-01   0.338  0.73515    
## areaz115      2.577e-01  9.719e-02   2.651  0.00802 ** 
## areaz221      1.725e-01  9.792e-02   1.761  0.07819 .  
## areazOthers  -4.527e-02  1.049e-01  -0.432  0.66601    
## areazUnknown -1.667e-01  1.235e-01  -1.350  0.17701    
## items         1.088e-02  3.660e-03   2.973  0.00294 ** 
## groupR1       3.274e-01  7.295e-02   4.488 7.18e-06 ***
## groupR2       2.474e-01  5.418e-02   4.567 4.95e-06 ***
## groupS1      -4.725e-03  2.039e-01  -0.023  0.98152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23258  on 19980  degrees of freedom
## AIC: 23314
## 
## Number of Fisher Scoring iterations: 5
pred2 =  predict(glm2, TS3, type="response")
cm = table(actual = TS3$buy, predict = pred2 > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3641  962
##   TRUE   1602 2371
acc.ts = cm %>% {sum(diag(.))/sum(.)}
c(1-mean(TS3$buy) , acc.ts)  # 0.7010261
## [1] 0.5367304 0.7010261
colAUC(pred2, TS3$buy)        # 0.7564
##                     [,1]
## FALSE vs. TRUE 0.7563764
tsAUC2 = colAUC(pred2, y=TS3$buy, plotROC=T)


Regression Model

A12 = subset(A1, A1$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR4 = subset(A12, spl4)
TS4 = subset(A12, !spl4)
lm2 = lm(amount ~ ., TR4[,c(2:6,8:10,12,13)])
summary(lm2)
## 
## Call:
## lm(formula = amount ~ ., data = TR4[, c(2:6, 8:10, 12, 13)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98081 -0.22905  0.04637  0.28129  1.35949 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.365e+00  5.466e-02  24.981  < 2e-16 ***
## r            -6.103e-05  4.243e-04  -0.144 0.885626    
## s             4.504e-04  4.247e-04   1.060 0.289010    
## f             1.921e-02  2.155e-03   8.911  < 2e-16 ***
## m             2.613e-01  5.489e-02   4.761 1.96e-06 ***
## rev           1.950e-01  5.248e-02   3.716 0.000203 ***
## agea29        4.846e-02  2.515e-02   1.927 0.054038 .  
## agea34        9.036e-02  2.319e-02   3.896 9.85e-05 ***
## agea39        1.173e-01  2.280e-02   5.144 2.75e-07 ***
## agea44        1.038e-01  2.343e-02   4.431 9.50e-06 ***
## agea49        6.026e-02  2.431e-02   2.479 0.013197 *  
## agea54        8.177e-02  2.648e-02   3.088 0.002023 ** 
## agea59        3.953e-02  3.092e-02   1.278 0.201172    
## agea64        1.001e-02  3.244e-02   0.308 0.757716    
## agea69       -3.997e-02  2.879e-02  -1.389 0.165002    
## agea99        8.905e-02  4.061e-02   2.193 0.028345 *  
## areaz106      9.675e-02  4.250e-02   2.277 0.022831 *  
## areaz110      5.652e-02  3.453e-02   1.637 0.101757    
## areaz114      2.543e-02  3.630e-02   0.701 0.483631    
## areaz115      2.340e-02  3.164e-02   0.740 0.459558    
## areaz221      3.907e-02  3.189e-02   1.225 0.220456    
## areazOthers   4.569e-02  3.426e-02   1.334 0.182299    
## areazUnknown  2.253e-02  3.852e-02   0.585 0.558721    
## items         8.598e-03  1.068e-03   8.054 9.01e-16 ***
## groupR1      -1.140e-01  2.011e-02  -5.669 1.48e-08 ***
## groupR2      -1.058e-01  2.263e-02  -4.677 2.96e-06 ***
## groupS1       9.557e-02  7.310e-02   1.307 0.191090    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4227 on 9242 degrees of freedom
## Multiple R-squared:  0.2874, Adjusted R-squared:  0.2854 
## F-statistic: 143.3 on 26 and 9242 DF,  p-value: < 2.2e-16
r2.tr2 = summary(lm2)$r.sq
SST2 = sum((TS4$amount - mean(TR4$amount))^ 2)
SSE2 = sum((predict(lm2, TS4) -  TS4$amount)^2)
r2.ts2 = 1 - (SSE/SST)
c(R2train2=r2.tr2, R2test2=r2.ts2)
##  R2train2   R2test2 
## 0.2873732 0.2845795


進行預測

Fig-3: Prediction


Aggregate data 2000-12-01 ~ 2001~02-28.

load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
    items = mean(items), 
    period=s-r
  ) %>% data.frame      # 28584
nrow(B)
## [1] 28531
STS = c("N1","R1","R2","S1")
Status = function(rx,fx,sx,dx) {factor(
  ifelse(dx == 0,
         ifelse(rx <90  , "N1","S1"),
         ifelse( dx/fx <10 ,"R2","R1")), STS)} 
B1 = B %>%  mutate(group=Status(r,f,s,period))
N1 = B1 %>% filter(group=="N1")%>%pull(cust)
S1 = B1 %>% filter(group=="S1")%>%pull(cust)
R1 = B1 %>% filter(group=="R1")%>%pull(cust)
R2 = B1 %>% filter(group=="R2")%>%pull(cust)

In B, there is a record for each customer. B$Buy is the probability of buying in March.

B1$Buy = predict(glm1, B1, type="response")

💡: 預測購買金額時要記得做指數、對數轉換!

B2 = B1 %>% mutate_at(c("m","rev"), log10)
B1$Rev = 10^predict(lm1, B2)
par(mfrow=c(1,2), cex=0.8)
hist(B1$Buy)
hist(log(B1$Rev,10))

save(B1, file='data/tf4.rdata')
B1$Buy2 = predict(glm2, B1, type="response")
B1$Rev2 = 10^predict(lm2, B2)
par(mfrow=c(1,2), cex=0.8)
hist(B1$Buy2)
hist(log(B1$Rev2,10))

hist(B1$Buy)
hist(log(B1$Rev,10))

B1 %>% group_by(group)%>%summarise(buy=mean(Buy2),rev=mean(Rev2)) #各族群的回購機率和預期購買金額
## # A tibble: 4 x 3
##   group   buy   rev
##   <fct> <dbl> <dbl>
## 1 N1    0.259  912.
## 2 R1    0.592  949.
## 3 R2    0.641 1122.
## 4 S1    0.232 1133.
sum(B1$raw)/sum(B1$rev) #公司目前獲利率
## [1] 0.1567946
g = 0.15   # (稅前)獲利率
N = 5     # 期數 = 5
d = 0.1   # 利率 = 10%
B1$CLV = g * B1$Rev2 * rowSums(sapply(
  0:N, function(i) (B1$Buy2/(1+d))^i ) )

summary(B1$CLV) #估計顧客終生價值(CLV)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    17.46   129.35   210.69   304.16   352.63 17338.44
par(mar=c(2,2,3,1), cex=0.8)
hist(log(B1$CLV,10), xlab="", ylab="")

# 各族群的平均營收貢獻、保留機率、終生價值
B1 %>% group_by(group) %>% summarise_at(vars(Buy2:CLV), mean)
## # A tibble: 4 x 4
##   group  Buy2  Rev2   CLV
##   <fct> <dbl> <dbl> <dbl>
## 1 N1    0.259  912.  178.
## 2 R1    0.592  949.  317.
## 3 R2    0.641 1122.  524.
## 4 S1    0.232 1133.  215.

###繪製顧客終生價值對顧客狀態分群的盒狀圖。

par(mar=c(3,3,4,2), cex=0.8)
boxplot(log(CLV,10)~group, B1, main="CLV by Groups")

估計每位顧客的淨收益 \(\hat{R}(x)\)

m=0.2; b=25; a=40; x=30
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}
dp = pmin(1-B1$Buy2, DP(x,m,b,a))
eR = dp*B1$Rev2*g - x
hist(eR,main="預期淨收益分佈",xlab="預期淨收益",ylab="顧客人數")

mm=c(0.20, 0.25, 0.15, 0.25)
bb=c(  25,   30,   15,   30)
aa=c(  40,   40,   30,   60) 
X = seq(0,60,2) 
do.call(rbind, lapply(1:length(mm), function(i) data.frame(
  Inst=paste0('Inst',i), Cost=X, 
  Gain=DP(X,mm[i],bb[i],aa[i])
  ))) %>% data.frame %>% 
  ggplot(aes(x=Cost, y=Gain, col=Inst)) +
  geom_line(size=1.5,alpha=0.5) + theme_bw() +
  ggtitle("Prob. Function: f(x|m,b,a)")

X = seq(10, 60, 1) 
df = do.call(rbind, lapply(1:length(mm), function(i) {
  sapply(X, function(x) {
    dp = pmin(1-B1$Buy2, DP(x,mm[i],bb[i],aa[i]))
    eR = dp*B1$Rev2*g - x
    c(i=i, x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
    }) %>% t %>% data.frame
  })) 

df %>% 
  mutate_at(vars(eR.ALL, eR.SEL), function(y) round(y/1000)) %>% 
  gather('key','value',-i,-x) %>% 
  mutate(Instrument = paste0('I',i)) %>%
  ggplot(aes(x=x, y=value, col=Instrument)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('工具選項(成本)') + ylab('預期收益($K)') + 
  ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw() -> p

plotly::ggplotly(p)
ci = sapply(
  list(c("N1"),c("R1"),
       c("R2"),c("S1")), 
  function(v) B1$group %in% v)  

X2 = seq(10, 60, 1) 
df = do.call(rbind, lapply(1:length(mm), function(i) {
  sapply(X2, function(x) {
    dp = pmin(1- B1$Buy2[ ci[,i] ]  , DP(x,mm[i],bb[i],aa[i]))
    eR = dp* B1$Rev2[ ci[,i] ]  *g - x
    c(i=i, x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
    }) %>% t %>% data.frame
  })) 

group_by(df, i) %>% top_n(1,eR.SEL)
## # A tibble: 4 x 5
## # Groups:   i [4]
##       i     x   eR.ALL     N eR.SEL
##   <dbl> <dbl>    <dbl> <dbl>  <dbl>
## 1     1    35 -114289.  2405 43944.
## 2     2    40  -85275.  2554 41533.
## 3     3    21  -35312.  1341 10911.
## 4     4    44    -573.    30   801.


🚴 討論行銷方案:
如果上述4組工具參數分別是4種行銷工具對4個顧客族群的效果:
  ■ I1 : N1
  ■ I2 : R1
  ■ I3 : R2
  ■ I4 : S1
針對這4個顧客族群之中選擇行銷對象:
  ■ 選擇行銷對象(N)
    N1族群中的2405位顧客;
    R1族群中的2554位顧客;
    R2族群中的1341位顧客;
    S1族群中的30位顧客,總共6330位顧客做行銷。
  ■ 設定行銷工具的面額(x)
    N1族群發送35元價值的折價券;
    R1族群發送40元價值的滿額外送免運券;
    R2族群發送21元價值的組合包優惠;
    S1族群發送44元價值的試吃包。
  ■ 估計預期報償(eR.SEL)?
    預計N1族群帶來報償為43943.6元
    預計R1族群帶來報償為41533.04元
    預計R2族群帶來報償為10911.4元
    預計S1族群帶來報償為801.18元,總預期報償為97,189.22元。